perm filename TEXHYF.WEB[PAS,DEK] blob
sn#672860 filedate 1982-08-11 generic text, type C, neo UTF8
COMMENT ⊗ VALID 00010 PAGES
C REC PAGE DESCRIPTION
C00001 00001
C00002 00002 % Here is TEX material that gets inserted after \input webhdr
C00005 00003 @* Introduction.
C00011 00004 @* I/O.
C00014 00005 @* String handling.
C00019 00006 @* Linked memory.
C00028 00007 @* Hyphenation.
C00046 00008 @* Initializing the hyphenation tables.
C00074 00009 @* The main program.
C00076 00010 @* Index.
C00077 ENDMK
C⊗;
% Here is TEX material that gets inserted after \input webhdr
\magnify{1200}\setpage
\def\hang{\hangindent 3em\ \unskip\!}
\def\textindent#1{\hangindent 2.5em\noindent\hbox to 2.5em{\hss#1 }\!}
\chcode@@=13 \def@@{\penalty999\ } % ties words together
\def\TEX{T\hbox{\hskip-.1667em\lower.424ex\hbox{E}\hskip-.125em X}}
\font b=cmr9 \def\mc{\:b} % medium caps for names like PASCAL
\def\PASCAL{{\mc PASCAL}}
\font L=manfnt % font used for the METAFONT logo
\def\MF{{\:L META}\-{\:L FONT}}
\def\(#1){} % this is used to make module names sort themselves better
\font D=cmtt at 15truept % font used in the title line below (only)
\font E=cmr7 at 14truept % font used in the title line below (only)
\def\title{TEXHYF}
\def\contentspagenumber{101}
\def\topofcontents{\topspace 0pt
\vfill
\ctrline{\:E The {\:D \TEX} hyphenator}
\vskip 15pt
\ctrline{(preliminary version, in preparation)}
\vfill}
\def\botofcontents{\vfill
\ctrline{\ragged0\spaceskip0pt\xspaceskip0pt\baselineskip9pt
\hbox par 5in{\:bDon't bother reading this program,
because it is just being thrown together for quick
testing purposes, and it will soon be scrapped.}}}
\setcount0 \contentspagenumber
\topofcontents
\ctrline{(replace this page by the contents page printed later)}
\botofcontents
\mark{1}\eject
@* Introduction.
This program has been extracted from the first draft of \TEX\ in order to
make independent tests of the hyphenation algorithm in isolation.
It is not especially well organized, since it is expected to have a
very short life.
@p program TEXHYF(@!input);
label 9999;
const @<Constants in the outer block@>@/
type @<Types in the outer block@>@/
var@?@<Globals in the outer block@>@/
@#
procedure initialize; {this procedure gets things started properly}
begin @<Set initial values@>@/
end;@#
@ Here we define the syntax of an extended case statement.
@d othercases == others: {default for cases not listed explicitly}
@d endcases == @+end {follows the default case in an extended |case| statement}
@f othercases == else
@f endcases == end
@ The following parameters can be changed at compile time to extend or
reduce the capacity.
@<Constants...@>=
@!max_strings=3000; {maximum number of strings}
@!pool_size=15000; {maximum number of characters in strings, including all
error messages and help texts, and the names of all fonts and
control sequences}
@!trie_size=7000; {space for hyphenation patterns, should be larger for
\.{INITEX} than it is in production versions of \TEX}
@ @d hyph_size=307 {another prime; the number of \.{\\hyphenation} exceptions}
@ Labels are given symbolic names by the following definitions, so that
occasional |goto| statements will be meaningful. We insert
the label `|exit|:' just before the `\!|end|\unskip' of a procedure in which we have
used the `|return|' statement defined below;
the label `|restart|' is occasionally used at the very beginning of a
procedure; and the label `|reswitch|' is occasionally used just prior to
a |case| statement in which some cases change the conditions and we wish to
branch to the newly applicable case.
Loops that are set up with the |loop| construction defined below are
commonly exited by going to `|done|' or to `|found|' or to `|not_found|',
and they are sometimes repeated by going to `|continue|'.
Incidentally, this program never declares a label that isn't actually used,
because some fussy \PASCAL\ compilers will complain about redundant labels.
@d exit=10 {go here to leave a procedure}
@d restart=20 {go here to start a procedure again}
@d reswitch=21 {go here to start a case statement again}
@d continue=22 {go here to resume a loop}
@d done=30 {go here to exit a loop}
@d done1=31 {like |done|, but when there is more than one loop}
@d done2=32 {for exiting the second loop in a block}
@d done3=33 {for exiting the third loop in a block}
@d done4=34 {for exiting the fourth loop in a long block}
@d done5=35 {for exiting the fifth loop in a very long block}
@d done6=36 {for exiting the sixth loop in an extremely long block}
@d done7=37 {for exiting the seventh loop in an immense block}
@d done8=38 {for exiting the eighth loop in a block}
@d found=40 {go here when you've found it}
@d not_found=41 {go here when you've found something else}
@ Here are some macros for common programming idioms.
@d incr(#) == #←#+1 {increase a variable by unity}
@d decr(#) == #←#-1 {decrease a variable by unity}
@d loop == @+ while true do@+ {repeat over and over until a |goto| happens}
@f loop == xclause {\.{WEB}'s |xclause| acts like `\!|while true do|\unskip'}
@d do_nothing == {empty statement}
@d return == goto exit {terminate a procedure call}
@f return == nil
@* I/O.
Here I'm rigging up a dummy |get_nc_next| routine to substitute for the
big one in \TEX.
@d lc_code(#)==#
@<Types...@>=
@!ascii_code=0..127;
@ @<Glob...@>=
@!buf:array[0..100] of char;
@!buf_ptr:0..100; {the next character to read}
@!buf_lim:0..100; {the last character in the buffer}
@!cur_chr:ascii_code;
@!cur_cmd:0..10;
@!terminal:boolean; {should input come from the terminal?}
@ @<Set init...@>=
buf_ptr←0; buf_lim←0; terminal←false;
@ @d spacer=1 {code for space}
@d left_brace=2 {code for left brace}
@d right_brace=3 {code for guess what}
@d esc=4 {code for \.{\\}}
@d letter=5 {code for everything else}
@d other_char=6 {unused code}
@p procedure get_nc_next; {inputs next character into |cur_cmd|, |cur_chr|}
begin if buf_ptr>buf_lim then
begin if terminal then
begin write_ln(tty); write(tty,'*'); read_ln(tty);
read(tty,buf:buf_lim);
end
else begin read(input,buf:buf_lim);
read_ln(input);
end;
cur_cmd←spacer; cur_chr←@'40; buf_ptr←0;
end
else begin cur_chr←ord(buf[buf_ptr]); incr(buf_ptr);
case cur_chr of
" ":cur_cmd←spacer;
"{":cur_cmd←left_brace;
"~":cur_cmd←right_brace;
"\":cur_cmd←esc;
othercases cur_cmd←letter
endcases;
end;
end;
@ @d error(#)==begin write(tty,#); terminal←true; repeat get_nc_next;
until cur_chr="*";
end
@p procedure scan_left_brace;
begin repeat get_nc_next;
if (cur_cmd≠spacer)∧(cur_cmd≠left_brace) then error('no left brace!');
until cur_cmd=left_brace;
end;
procedure scan_optional_space;
begin
end;
@* String handling.
Elaborate facilities for dynamic strings are not needed, so all of the
necessary operations can be handled with a simple data structure.
The array |str_pool| contains all of the (seven-bit) ascii codes in all
of the strings, and the array |str_start| contains indices of the starting
points of each string. Strings are referred to by integer numbers, so that
string number |s| comprises the characters |str_pool[j]| for
|str_start[s]≤j<str_start[s+1]|. Additional integer variables
|pool_ptr| and |str_ptr| indicate the number of entries used so far
in |str_pool| and |str_start|, respectively; locations
|str_pool[pool_ptr]| and |str_start[str_ptr]| are
ready for the next string to be allocated.
@<Types...@>=
@!pool_pointer = 0..pool_size; {for variables that point into |str_pool|}
@!str_number = 0..max_strings; {for variables that point into |str_start|}
@ @<Globals...@>=
@!str_pool:packed array[pool_pointer] of ascii_code; {the characters}
@!str_start : array[str_number] of pool_pointer; {the starting pointers}
@!pool_ptr : pool_pointer; {first unused position in |str_pool|}
@!str_ptr : str_number; {start of the current string being created}
@ @<Set init...@>=
str_ptr←1; pool_ptr←0; str_start[1]←0;
@ Several of the elementary string operations are performed using \.{WEB}
macros instead of using \PASCAL\ procedures, because many of the
operations are done quite frequently and we want to avoid the
overhead of procedure calls. For example, here is
a simple macro that computes the length of a string.
@.WEB@>
@d length(#)==(str_start[#+1]-str_start[#]) {the number of characters
in string number \#}
@ Strings are created by appending character codes to |str_pool|.
The |append_char| macro defined here does not check to see if the
value of |pool_ptr| has gotten too high; this test is supposed to be
made before |append_char| is used. There is also a |flush_char|
macro, which erases the last character appended.
@d append_char(#) == {put |ascii_code| \#\ at the end of |str_pool|}
begin str_pool[pool_ptr]←#; incr(pool_ptr);
end
@d flush_char == decr(pool_ptr) {forget the last character in the pool}
@ The length of the current string is called |cur_length|:
@d cur_length == (pool_ptr - str_start[str_ptr])
@ To test if there is room to append |l| more characters to |str_pool|,
we shall write |str_room(l)|, which aborts \TEX\ and gives an
apologetic error message if there isn't enough room.
@d str_room(#) == begin if pool_ptr+# > pool_size then
error('pool overflow!');
end
@ Once a sequence of characters has been appended to |str_pool|, it
officially becomes a string when the function |make_string| is called.
This function returns the identification number of the new string as its
value.
@p function make_string : str_number; {current string enters the pool}
begin if str_ptr=max_strings then
error('string overflow!');
incr(str_ptr); str_start[str_ptr]←pool_ptr;
make_string←str_ptr-1;
end;
@ To destroy the most recently made string, we say |flush_string|.
@d flush_string==begin decr(str_ptr); pool_ptr←str_start[str_ptr];
end
@* Linked memory.
Here I'm trying to get away with a minimum of effort.
@<Types...@>=
@!pointer=0..1000;
@ @d info(#)==mem[#].lh
@d link(#)==mem[#].rh
@d null==0
@<Glob...@>=
@!mem:array[pointer] of two_halves;
@!mem_ptr:pointer;
@ @<Set init...@>=
mem_ptr←null;
@ @p function get_avail:pointer;
begin incr(mem_ptr); get_avail←mem_ptr;
end;
@* Hyphenation.
When a word |hc[1..hn]| has been set up to contain a candidate for hyphenation,
\TEX\ first looks to see if it is in the user's exception dictionary. If not,
hyphens are inserted based on patterns that appear within the given word,
using an algorithm due to Frank M. Liang.
@↑Liang, Franklin Mark@>
Let's consider Liang's method first, since it is more interesting than the
exception-lookup routine. The algorithm begins by setting |hyf[j]| to zero
for all |j|, and invalid `\.{\UA\UA?}' characters are inserted into |hc[0]|
and |hc[hn+1]| to serve as delimiters. Then a reasonably fast method is
used to see which of a given set of patterns occurs in the word
|hc[0..(hn+1)]|. Each pattern $p↓1\ldotsm p↓k$ of length |k| has an associated
sequence of |k+1| numbers $n↓0\ldotsm n↓k$; and if the pattern occurs in
|hc[(j+1)..(j+k)]|, \TEX\ will set |hyf[j+i]←@tmax@>(hyf[j+i],@t$n↓i$@>)| for
|0≤i≤k|. After this has been done for each pattern that occurs, a discretionary
hyphen will be inserted between |hc[j]| and |hc[j+1]| when |hyf[j]| is odd,
as we have already seen.
The set of patterns $p↓1\ldotsm p↓k$ and associated numbers $n↓0\ldotsm n↓k$
depends, of course, on the language whose words are being hyphenated, and
on the degree of hyphenation that is desired. A method for finding
appropriate |p|'s and |n|'s, from a given dictionary of words and acceptable
hyphenations, is discussed in Liang's Ph.D. thesis (Stanford University,
1982); \TEX\ simply starts with the patterns and works from there.
@ The patterns are stored in a compact table that is also efficient for
retrieval, using a variant of ``trie memory'' [cf.\ {\sl The Art of
Computer Programming \bf3} (1973), 481--505]. We can find each pattern
$p↓1\ldotsm p↓k$ by setting |@t$z↓1$@>←@t$p↓1$@>| and then, for |1≤i<k|,
setting |@t$z↓{i+1}$@>←trie_link@t$(z↓i)+p↓{i+1}$@>|; the pattern will be
identified by the number $z↓k$. Since all the pattern information is
packed together into a single |trie_link| array, it is necessary to
prevent confusion between the data from inequivalent patterns, so another
table is provided such that |trie_char@t$(z↓i)=p↓i$@>| for all |i|. There
is also a table |trie_op|$(z↓k)$ to identify the numbers $n↓0\ldotsm n↓k$
associated with $p↓1\ldotsm p↓k$.
Comparatively few different number sequences $n↓0\ldotsm n↓k$ actually occur,
since most of the |n|'s are generally zero. Therefore the number sequences
are encoded in such a way that |trie_op|$(z↓k)$ is only one byte long.
If |trie_op(@t$z↓k$@>)≠min_quarterword|, when $p↓1\ldotsm p↓k$ has matched
the letters in |hc[(l-k+1)..l]|, we perform all of the required operations
for this pattern by carrying out the following little program: Set
|v←trie_op(@t$z↓k$@>)|. Then set |hyf[l-hyf_distance[v]]←@tmax@>(
hyf[l-hyf_distance[v]], hyf_num[v])|, and |v←hyf_next[v]|; repeat, if
necessary, until |v=min_quarterword|.
@d min_halfword=0
@d max_halfword==@'177777
@d min_quarterword=1 {just to try it!}
@d max_quarterword=250 {ditto}
@<Types...@>=
@!small_number=0..63;
@!trie_pointer= 0..trie_size; {an index into |trie|}
@!quarterword=min_quarterword..max_quarterword;
@!halfword=min_halfword..max_halfword;
@!two_choices = 1..2; {used when there are two variants in a record}
@!two_halves = packed record@/
@!rh:halfword;
case two_choices of
1: (@!lh:halfword);
2: (@!b0:quarterword; @!b1:quarterword);
end;
@ @d trie_link(#)==trie[#].rh {``downward'' link in a trie}
@d trie_char(#)==trie[#].b1 {character matched at this trie location}
@d trie_op(#)==trie[#].b0 {program for hyphenation at this trie location}
@<Glob...@>=
@!hc:array[0..65] of halfword; {word to be hyphenated}
@!hn:small_number; {the number of positions occupied in |hc|}
@!hyf:array [0..64] of 0..9; {odd values indicate discretionary hyphens}
@!trie:array[trie_pointer] of two_halves; {|trie_link|, |trie_char|, |trie_op|}
@!hyf_distance:array[quarterword] of small_number; {position |k-j| of $n↓j$}
@!hyf_num:array[quarterword] of small_number; {value of $n↓j$}
@!hyf_next:array[quarterword] of quarterword; {continuation of this |trie_op|}
@ @<Global...@>=
@!z:trie_pointer; {an index into |trie|}
@!v:quarterword; {an index into |hyf_distance|, etc.}
@ Assuming that these auxiliary tables have been set up properly, the
hyphenation algorithm is quite short. In the following code we set |hc[hn+2]|
to the impossible value 256, in order to guarantee that |hc[hn+3]| will
never be fetched.
@<Find hyphen locations for the word in |hc|@>=
for j←0 to hn do hyf[j]←0;
@<Look for the word |hc[1..hn]| in the exception table, and |goto found| (with
|hyf| containing the hyphens) if an entry is found@>;
hc[0]←127; hc[hn+1]←127; hc[hn+2]←256; {insert delimiters}
for j←0 to hn-2 do
begin z←hc[j]; l←j;
while hc[l]=trie_char(z) do
begin if trie_op(z)≠min_quarterword then
@<Update the |hyf| table@>;
incr(l); z←trie_link(z)+hc[l];
end;
end;
found:
@ @<Update the |hyf| table@>=
begin v←trie_op(z);
repeat i←l-hyf_distance[v];
if hyf_num[v]>hyf[i] then hyf[i]←hyf_num[v];
v←hyf_next[v];
until v=min_quarterword;
end
@ The exception table that is built by \TEX's \.{\\hyphenation} primitive is
organized as an ordered hash table [cf.\ Amble and Knuth, {\sl The Computer
@↑Amble, Ole@> @↑Knuth, Donald Ervin@>
Journal\/\ \bf17} (1974), 135--142] using linear probing. If $\alpha$ and
$\beta$ are words, we will say that $\alpha<\beta$ if $\leftv\alpha\rightv<
\leftv\beta\rightv$ or if $\leftv\alpha\rightv=\leftv\beta\rightv$ and
$\alpha$ is lexicographically smaller than $\beta$. (The notation $\leftv
\alpha\rightv$ stands for the length of $\alpha$.) The idea of ordered hashing
is to arrange the table so that a given word $\alpha$ can be sought by computing
a hash address $h=h(\alpha)$ and then looking in table positions $h$, $h-1$,
$\ldotss$, until encountering the first word $\L\alpha$. If this word is
different from $\alpha$, we can conclude that $\alpha$ is not in the table.
The words in the table point to lists in |mem| that specify hyphen positions
in their |info| fields. The list for $c↓1\ldotsm c↓n$ contains $k$ if
the word $c↓1\ldotsm c↓n$ has a discretionary hyphen between $c↓k$ and
$c↓{k+1}$.
@<Types...@>=
@!hyph_pointer=0..hyph_size; {an index into the ordered hash table}
@ @<Glob...@>=
@!hyph_word:array[hyph_pointer] of str_number; {exception words}
@!hyph_list:array[hyph_pointer] of pointer; {list of hyphen positions}
@!hyph_count:hyph_pointer; {the number of words in the exception dictionary}
@ @d init==
@d tini==
@f init==begin
@f tini==end
@<Set init...@>=
init for hyph_count←0 to hyph_size do hyph_word[hyph_count]←0;
hyph_count←0;
tini
@ The algorithm for exception lookup is quite simple, as soon as we have
a few more local variables to work with.
@<Glob...@>=
@!i,@!j,@!l:small_number; {indices into |hc|, etc.}
@!s:pointer; {used somewhere else but I'm declaring it here}
@!h:hyph_pointer; {an index into |hyph_word| and |hyph_list|}
@!k:str_number; {an index into |str_start|}
@!u:pool_pointer; {an index into |str_pool|}
@ First we compute the hash code |h|, then we search until we either
find the word or we don't.
@<Look for the word |hc[1...@>=
h←hc[1];
for j←2 to hn do h←(h+h+hc[j]) mod hyph_size;
while hyph_word[h]≠0 do
begin @<If the string |hyph_word[h]| is less than \(hc)|hc[1..hn]|,
|goto not_found|; but if the two strings are equal,
set |hyf| to the hyphen positions and |goto found|@>;
if h>0 then decr(h)@+else h←hyph_size;
end;
not_found:
@ @<If the string |hyph_word[h]| is less than \(hc)...@>=
k←hyph_word[h];
if length(k)<hn then goto not_found;
if length(k)=hn then
begin j←1; u←str_start[k];
repeat if hc[j]>str_pool[u] then goto not_found;
if hc[j]<str_pool[u] then goto done8;
incr(j); incr(u);
until j>hn;
@<Insert hyphens as specified in |hyph_list[h]|@>;
goto found;
end;
done8:
@ @<Insert hyphens as specified...@>=
s←hyph_list[h];
while s≠null do
begin hyf[info(s)]←1; s←link(s);
end
@ We have now completed the hyphenation routine, so the |line_break| procedure
is finished at last. Since the hyphenation exception table is fresh in our
minds, it's a good time to deal with the routine that adds new entries to it.
When \TEX\ has scanned the `\.{\\hyphenation}' control sequence, it calls
on a procedure named |new_hyph_exceptions| to do the right thing.
@p procedure new_hyph_exceptions; {enters new exceptions}
label exit, found, not_found;
var n:small_number; {length of current word}
@!j:small_number; {an index into |hc|}
@!p:pointer; {head of a list of hyphen positions}
@!q:pointer; {used when creating a new node for list |p|}
@!s,@!t:str_number; {strings being compared or stored}
@!u,@!v:pool_pointer; {indices into |str_pool|}
begin scan_left_brace; {a left brace must follow \.{\\hyphenation}}
@<Enter as many hyphenation exceptions as are listed,
until coming to a right brace; then skip an optional space and |return|@>;
exit:end;
@ @<Enter as many...@>=
n←0; p←null;
loop@+ begin get_nc_next;
case cur_cmd of
letter,other_char:@<Append a new letter or hyphen@>;
spacer,right_brace: begin if n>4 then @<Enter a hyphenation exception@>;
if cur_cmd=right_brace then
begin scan_optional_space; return;
end;
n←0; p←null;
end;
othercases @<Give improper \.{\\hyphenation} error and abort@>
endcases;
end;
@ @<Give improper \.{\\hyph...@>=
begin error('improper \hyphenation!');
end
@ @<Append a new letter or hyphen@>=
if cur_chr="-" then @<Append the value |n| to list |p|@>
else begin if lc_code(cur_chr)=0 then
begin error('not a letter!');
end
else if n<63 then
begin incr(n); hc[n]←lc_code(cur_chr);
end;
end
@ @<Append the value |n| to list |p|@>=
begin if n>1 then
begin q←get_avail; link(q)←p; info(q)←n; p←q;
end;
end
@ @<Enter a hyphenation exception@>=
begin str_room(n); h←0;
for j←1 to n do
begin h←(h+h+hc[j]) mod hyph_size;
append_char(hc[j]);
end;
s←make_string;
while info(p)>n-3 do p←link(p); {eliminate hyphens \TEX\ doesn't like}
@<Insert |(s,p)| into the exception table@>;
end
@ @<Insert |(s,p)|...@>=
if hyph_count=hyph_size then error('exception overflow!');
incr(hyph_count);
while hyph_word[h]≠0 do
begin @<If the string |hyph_word[h]| is less than \(or)or equal to
|s|, interchange |(hyph_word[h],hyph_list[h])| with |(s,p)|@>;
if h>0 then decr(h)@+else h←hyph_size;
end;
hyph_word[h]←s; hyph_list[h]←p
@ @<If the string |hyph_word[h]| is less than \(or)...@>=
k←hyph_word[h];
if length(k)<length(s) then goto found;
if length(k)>length(s) then goto not_found;
u←str_start[k]; v←str_start[s];
repeat if str_pool[u]<str_pool[v] then goto found;
if str_pool[u]>str_pool[v] then goto not_found;
incr(u); incr(v);
until u=str_start[k+1];
found:q←hyph_list[h]; hyph_list[h]←p; p←q;
t←hyph_word[h]; hyph_word[h]←s; s←t;
not_found:
@* Initializing the hyphenation tables.
The trie for \TEX's hyphenation algorithm is built from a sequence of
patterns following a \.{\\patterns} specification. Such a specification
is allowed only in \.{INITEX}, since the extra memory for auxiliary tables
and for the initialization program itself would only clutter up the
production version of \TEX\ with a lot of deadwood.
The initialization first builds a trie that is linked instead of packed
into sequential storage, so that insertions are readily made. Then it
compresses the linked trie by identifying common subtries, and finally the
trie is packed into the efficient sequential form that the hyphenation
algorithm actually uses.
@p init @<Declaration of procedures for preprocessing hyphenation patterns@>@;
tini
@ Before we discuss trie building in detail, let's consider the simpler
problem of creating the |hyf_distance|, |hyf_num|, and |hyf_next| arrays.
Suppose, for example, that \TEX\ reads the pattern `\.{ab2cde1}'. This
is a pattern of length 5, with $n↓0\ldotsm n↓5=0\,0\,2\,0\,0\,1$ in the
notation above. We want the corresponding |trie_op| code |v| to have
|hyf_distance[v]=3|, |hyf_num[v]=2|, and |hyf_next[v]=@t$v↑\prime$@>|,
where |hyf_distance[@t$v↑\prime$@>]=0|, |hyf_num[@t$v↑\prime$@>]=1|,
and |hyf_next[@t$v↑\prime$@>]=min_quarterword|.
\TEX\ computes an appropriate value |v| with the |new_trie_op| subroutine
below, by setting |@t$v↑\prime$@>←new_trie_op(0,1,min_quarterword)|, then
|v←new_trie_op(3,2,@t$v↑\prime$@>)|. This subroutine looks up its three
parameters in a special hash table, assigning a new value only if these
three have not appeared before.
The hash table is called |trie_op_hash|, and the number of entries it contains
is |trie_op_ptr|. If the table overflows, the excess ops are not stored,
and |trie_op_ptr=max_quarterword|.
@d quarterword_diff=max_quarterword-min_quarterword
@d trie_op_hash_size=quarterword_diff+quarterword_diff {double}
@<Glob...@>=
init@! trie_op_hash:array[0..trie_op_hash_size] of quarterword;
{trie op codes for triples}
@t\hskip1em@>@!trie_op_ptr:quarterword; {highest |trie_op| assigned}
tini
@ The hash function used by |new_trie_op| is based on the idea that
313/510 is an approximation to the golden ratio [cf.\ {\sl The Art of
Computer Programming \bf3} (1973), 510--512]; but the choice is
comparatively unimportant in this particular application.
@<Declaration of procedures for preprocessing hyph...@>=
function new_trie_op(@!d,@!n:small_number;@!v:quarterword):quarterword;
label exit;
var h:0..trie_op_hash_size; {trial hash location}
@!u:quarterword; {trial op code}
begin h←((n+313*d+361*v) mod trie_op_hash_size);
loop@+ begin u←trie_op_hash[h];
if u=min_quarterword then {empty position found}
begin if trie_op_ptr≥(max_quarterword-1) then {overflow}
begin trie_op_ptr←max_quarterword;
new_trie_op←min_quarterword; return;
end;
incr(trie_op_ptr); hyf_distance[trie_op_ptr]←d;
hyf_num[trie_op_ptr]←n; hyf_next[trie_op_ptr]←v;
trie_op_hash[h]←trie_op_ptr;
new_trie_op←trie_op_ptr; return;
end;
if (hyf_distance[u]=d)∧(hyf_num[u]=n)∧(hyf_next[u]=v) then
begin new_trie_op←u; return;
end;
if h>min_quarterword then decr(h) else h←max_quarterword;
end;
exit:end;
@ The linked trie that is used to preprocess hyphenation patterns appears
in several global arrays. Each node represents an instruction of the form
``if you see character |c|, then perform operation |o|, move to the
next character, and go to node |l|; otherwise go to node |r|.''
The four quantities |c|, |o|, |l|, and |r| are stored in four arrays
|trie_c|, |trie_o|, |trie_l|, and |trie_r|. The root of the trie
is |trie_l[0]|, and the number of nodes is |trie_ptr|. Null trie
pointers are represented by zero. To initialize the trie, we simply
set |trie_l[0]| and |trie_c[0]| and |trie_ptr| to zero.
The algorithms maintain the condition |trie_c[trie_r[z]]>trie_c[z]|
whenever |z≠0| and |trie_r[z]≠0|; in other words, sibling nodes are
ordered by their |c| fields.
@d trie_root==trie_l[0] {root of the linked trie}
@<Globals...@>=
init @!trie_c:array[trie_pointer] of ascii_code; {characters to match}
@t\hskip1em@>@!trie_o:array[trie_pointer] of quarterword; {operations to
perform}
@t\hskip1em@>@!trie_l:array[trie_pointer] of trie_pointer; {left subtrie links}
@t\hskip1em@>@!trie_r:array[trie_pointer] of trie_pointer; {right subtrie links}
@t\hskip1em@>@!trie_ptr:trie_pointer; {the number of nodes in the trie}
tini
@ Let us suppose that a linked trie has already been constructed.
Experience shows that we can often reduce its size by recognizing common
subtries; therefore another hash table is introduced for this purpose,
somewhat similar to |trie_op_hash|. The new hash table will be
initialized to zero.
@<Glob...@>=
init @!trie_hash:array[trie_pointer] of trie_pointer; {to identify equivalent
subtries}
tini
@ The function |trie_node(p)| returns |p| if |p| is distinct from other nodes
that it has seen, otherwise it returns the number of the first equivalent
node that it has seen.
@<Declaration of procedures for preprocessing hyph...@>=
function trie_node(@!p:trie_pointer):trie_pointer; {converts
to a canonical form}
label exit;
var h:trie_pointer; {trial hash location}
@!q:trie_pointer; {trial trie node}
begin h←(trie_c[p]+1009*trie_o[p]+2718*trie_l[p]+3142*trie_r[p]) mod trie_size;
loop@+ begin q←trie_hash[h];
if q=0 then
begin trie_hash[h]←p; trie_node←p; return;
end;
if (trie_c[q]=trie_c[p])∧(trie_o[q]=trie_o[p])∧@|
(trie_l[q]=trie_l[p])∧(trie_r[q]=trie_r[p]) then
begin trie_node←q; return;
end;
if h>0 then decr(h)@+else h←trie_size;
end;
exit:end;
@ A neat recursive procedure is now able to compress a trie by
traversing it and applying |trie_node| to its nodes in ``bottom up''
fashion. We will compress the entire trie by clearing |trie_hash| to
zero and then saying `|trie_root←compress_trie(trie_root)|'.
@↑recursion@>
@<Declaration of procedures for preprocessing hyph...@>=
function compress_trie(@!p:trie_pointer):trie_pointer;
begin if p=0 then compress_trie←0
else begin trie_l[p]←compress_trie(trie_l[p]);
trie_r[p]←compress_trie(trie_r[p]);
compress_trie←trie_node(p);
end;
end;
@ Before we forget how to initialize the data structures that have been
mentioned so far, let's write a procedure that does the initialization.
@<Declaration of procedures for preprocessing hyph...@>=
procedure init_pattern_memory; {gets ready to build a linked trie}
var h:0..trie_op_hash_size; {an index into |trie_op_hash|}
@!p:trie_pointer; {an index into |trie_hash|}
begin for h←0 to trie_op_hash_size do trie_op_hash[h]←min_quarterword;
trie_op_ptr←min_quarterword; trie_root←0; trie_c[0]←0; trie_ptr←0;
for p←0 to trie_size do trie_hash[p]←0;
end;
@ The compressed trie will be packed into the |trie| array using a
``top-down first-fit'' procedure. This is a little tricky, so the reader
should pay close attention: The |trie_hash| array is cleared to zero
again and renamed |trie_ref| for this phase of the operation; later on,
|trie_ref[p]| will be nonzero if the linked trie node |p| is the oldest sibling
in a family and if the characters |c| of that family have been allocated to
locations |trie_ref[p]+c| in the |trie| array. Locations of |trie| that
are in use will have |trie_link=0|, while the unused holes in |trie|
will be doubly linked with |trie_link| pointing to the next larger vacant
location and |trie_back| pointing to the next smaller one. This double
linking will have been carried out only as far as |trie_max|, where
|trie_max| is the largest index of |trie| that will be needed.
Another array |trie_taken| tells whether or not a given location is
equal to |trie_ref[p]| for some |p|; this array is used to ensure that
distinct nodes in the compressed trie will have distinct |trie_ref|
entries.
@d trie_ref==trie_hash {where linked trie families go into |trie|}
@d trie_back(#)==trie[#].lh {backward links in |trie| holes}
@<Glob...@>=
init@!trie_taken:array[trie_pointer] of boolean; {does a family start here?}
@t\hskip1em@>@!trie_max:trie_pointer; {largest location used in |trie|}
@t\hskip1em@>@!trie_min:trie_pointer; {all locations |≤trie_min| are vacant in |trie|}
tini
@ Here is how these data structures are initialized.
@<Declaration of procedures for preprocessing hyph...@>=
procedure init_trie_memory; {gets ready to pack into |trie|}
var p:trie_pointer; {index into |trie_ref|, |trie|, |trie_taken|}
begin for p←0 to trie_ptr do trie_ref[p]←0;
trie_max←128; trie_min←128; trie_link(0)←1; trie_taken[0]←false;
for p←1 to 128 do
begin trie_back(p)←p-1; trie_link(p)←p+1; trie_taken[p]←false;
end;
end;
@ Each time \.{\\patterns} appears, it overrides any patterns that were
entered earlier, so the arrays are not initialized until \TEX\ sees
\.{\\patterns}. However, some of the global variables must be
initialized when \.{INITEX} is loaded, in case the user never mentions
any \.{\\patterns}.
@<Set init...@>=
init trie_op_ptr←min_quarterword; trie_max←0;
tini
@ The |first_fit| procedure finds the smallest hole |z| in |trie| such that
a trie family starting at a given node |p| will fit into vacant positions
starting at |z|. If |c=trie_c[p]|, this means that location |z-c| must
not already be taken by some other family, and that |z-c+@t$c↑\prime$@>|
must be vacant for all characters $c↑\prime$ in the family. The procedure
sets |trie_ref[p]| to |z-c| when the first fit has been found.
@<Declaration of procedures for preprocessing hyph...@>=
procedure first_fit(@!p:trie_pointer); {packs a family into |trie|}
label not_found,found;
var h:trie_pointer; {candidate for |trie_ref[p]|}
@!z:trie_pointer; {runs through holes}
@!q:trie_pointer; {runs through the family starting at |p|}
@!c:ascii_code; {smallest character in the family}
begin c←trie_c[p]; {we have |c≠0|}
if c<trie_min then trie_min←c;
z←trie_link(trie_min-1); {get the first conceivably good hole}
loop@+ begin if z<c then goto not_found;
h←z-c; @<Ensure that |trie_max≥h+128|@>;
if trie_taken[h] then goto not_found;
@<If all characters of the family fit relative to |h|, then
|goto found|,\30\ otherwise |goto not_found|@>;
not_found: z←trie_link(z); {move to the next hole}
end;
found: @<Pack the family into |trie| relative to |h|@>;
end;
@ By making sure that |trie_max| is at least |h+128|, we can be sure that
|trie_max>z|, since |h=z+c|. It follows that location |trie_max| will
never be occupied in |trie|, and we will have |trie_max≥trie_link(z)|.
@<Ensure that |trie_max≥h+128|@>=
if trie_max<h+128 then
begin if trie_size≤h+128 then error('pattern overflow!');
repeat incr(trie_max); trie_taken[trie_max]←false;
trie_link(trie_max)←trie_max+1; trie_back(trie_max)←trie_max-1;
until trie_max=h+128;
end
@ @<If all characters of the family fit relative to |h|...@>=
q←trie_r[p];
while q>0 do
begin if trie_link(h+trie_c[q])=0 then goto not_found;
q←trie_r[q];
end;
goto found
@ @<Pack the family into |trie| relative to |h|@>=
trie_taken[h]←true; trie_ref[p]←h; q←p;
repeat z←h+trie_c[q]; trie_back(trie_link(z))←trie_back(z);
trie_link(trie_back(z))←trie_link(z); trie_link(z)←0; q←trie_r[q];
until q=0
@ To pack the entire linked trie, we use the following recursive procedure.
@↑recursion@>
@<Declaration of procedures for preprocessing hyph...@>=
procedure trie_pack(@!p:trie_pointer); {pack subtries of a family}
var q:trie_pointer; {a local variable that need not be saved on recursive calls}
begin repeat q←trie_l[p];
if (q>0)∧(trie_ref[q]=0) then
begin first_fit(q); trie_pack(q);
end;
p←trie_r[p];
until p=0;
end;
@ When the whole trie has been allocated into the sequential table, we
must go through it once again so that |trie| contains the correct
information. Null pointers in the linked trie will be replaced by the
first untaken position |r| in |trie|, since this properly implements an
``empty'' family. The value of |r| is stored in |trie_ref[0]| just before
the fixup process starts. Note that |trie_max| will always be at least as
large as |r+127|, since it is always at least 128 more than each location
that is taken.
@<Move the data into |trie|@>=
r←0;
while trie_taken[r] do incr(r);
trie_ref[0]←r; {|r| will be used for null pointers}
trie_fix(trie_root); {this fixes the non-holes in |trie|}
@ The fixing-up procedure is, of course, recursive. Since the linked trie
usually has overlapping subtries, the same data may be moved several
times; but that causes no harm, and at most as much work is done as it
took to build the uncompressed trie.
@↑recursion@>
@<Declaration of procedures for preprocessing hyph...@>=
procedure trie_fix(@!p:trie_pointer); {moves |p| and its siblings into |trie|}
var q:trie_pointer; {a local variable that need not be saved on recursive calls}
@!c:ascii_code; {another one that need not be saved}
@!z:trie_pointer; {|trie| reference; this local variable must be saved}
begin z←trie_ref[p];
while p≠0 do
begin q←trie_l[p]; c←trie_c[p];
trie_link(z+c)←trie_ref[q]; trie_char(z+c)←c; trie_op(z+c)←trie_o[p];
if q>0 then trie_fix(q);
p←trie_r[p];
end;
end;
@ Now let's put all these routines together. When \.{INITEX} has scanned
the `\.{\\patterns}' control sequence, it calls on |new_patterns| to do
the right thing. After |new_patterns| has acted, the desired pattern data
appears in |trie[1..trie_max]|, and the associated numeric data appears in
locations |[(min_quarterword+1).. trie_op_ptr]| of the arrays
|hyf_distance|, |hyf_num|, |hyf_next|.
@<Declaration of procedures for preprocessing hyph...@>=
procedure new_patterns; {initializes the hyphenation pattern data}
label done, done1;
var k,@!l:small_number; {indices into |hc| and |hyf|}
@!digit_sensed:boolean; {should the next digit be treated as a letter?}
@!v:quarterword; {trie op code}
@!p,@!q:trie_pointer; {nodes of trie traversed during insertion}
@!first_child:boolean; {is |p=trie_l[q]|?}
@!c:ascii_code; {character being inserted}
@!r,@!s:trie_pointer; {used to clean up the packed |trie|}
@!h:two_halves; {template used to zero out |trie|'s holes}
begin scan_left_brace; {a left brace must follow \.{\\patterns}}
init_pattern_memory;@/
@<Enter all of the patterns into a linked trie, until coming to a right
brace; then skip an optional space and |goto done|@>;
done: trie_root←compress_trie(trie_root); {compress the trie}
@<Pack the trie@>;
write(tty,' Hyphenation table sizes are ',trie_max:0,';',
trie_op_ptr-min_quarterword:0,'.');
end;
@ Novices are not supposed to be using \.{\\patterns}, so the error
messages are terse. (Note that all error messages appear in \TEX's string
pool, even if they are used only by \.{INITEX}.)
@<Enter all of the patterns into a linked trie...@>=
k←0; hyf[0]←0; digit_sensed←false;
loop@+ begin get_nc_next;
case cur_cmd of
letter,other_char:@<Append a new letter or a hyphen level@>;
spacer,right_brace: begin if k>0 then
@<Insert a new pattern into the linked trie@>;
if cur_cmd=right_brace then
begin scan_optional_space; goto done;
end;
k←0; hyf[0]←0; digit_sensed←false;
end;
othercases begin error('Bad \patterns!');
goto done;
end
endcases;
end
@ @<Append a new letter or a hyphen level@>=
if digit_sensed ∨(cur_chr<"0")∨(cur_chr>"9") then
begin if cur_chr="." then cur_chr←127 {change edge-of-word delimiter
to |@'177|}
else begin cur_chr←lc_code(cur_chr);
if cur_chr=0 then
begin error('Illegal character!');
cur_chr←127;
end;
end;
if k<63 then
begin incr(k); hc[k]←cur_chr; hyf[k]←0; digit_sensed←false;
end;
end
else begin hyf[k]←cur_chr-"0"; digit_sensed←true;
end
@ When the following code comes into play, the pattern $p↓1\ldotsm p↓k$
appears in |hc[1..k]|, and the corresponding sequence of numbers $n↓0\ldotsm
n↓k$ appears in |hyf[0..k]|.
@<Insert a new pattern into the linked trie@>=
begin @<Compute the trie op code, |v|, and set |l←0|@>;
q←0; first_child←true;
while l<k do
begin incr(l); c←hc[l]; first_child←true; p←trie_l[q];
while (p>0)∧(c>trie_c[p]) do
begin q←p; p←trie_r[q]; first_child←false;
end;
if (p=0)∨(c<trie_c[p]) then
@<Insert a new trie node between |q| and |p|, and
make |p| point to it@>;
q←p; {now node |q| represents $p↓1\ldotsm p↓l$}
end;
if trie_o[q]≠min_quarterword then
begin error('Duplicate pattern!');
end;
trie_o[q]←v;
end
@ @<Insert a new trie node between |q| and |p|...@>=
begin if trie_ptr=trie_size then error('pattern overflow');
incr(trie_ptr); trie_r[trie_ptr]←p; p←trie_ptr; trie_l[p]←0;
if first_child then trie_l[q]←p@+else trie_r[q]←p;
trie_c[p]←c; trie_o[p]←min_quarterword;
end
@ @<Compute the trie op code, |v|...@>=
l←k; v←min_quarterword;
loop@+ begin if hyf[l]≠0 then v←new_trie_op(k-l,hyf[l],v);
if l>0 then decr(l)@+else goto done1;
end;
done1:
@ The following packing routine is rigged so that the root of the linked
tree gets mapped into location 0 of |trie|, as required by the hyphenation
algorithm. This happens because the first call of |first_fit| will
``take'' location@@0.
@<Pack the trie@>=
init_trie_memory;
if trie_root≠0 then
begin first_fit(trie_root); trie_pack(trie_root);
end;
@<Move the data into |trie|@>;
r←0;
h.rh←0; h.b0←min_quarterword; h.b1←128; {finally, we will zero out the holes}
repeat s←trie_link(r); trie[r]←h; r←s;
until r>trie_max
@* The main program.
@p begin
initialize; hn←0;
repeat get_nc_next;
case cur_cmd of
esc: begin get_nc_next;
if cur_chr="h" then new_hyph_exceptions
else if cur_chr="p" then new_patterns
else if cur_chr="t" then terminal←not terminal
else if cur_chr="q" then goto 9999
else error('Undefined control sequence!');
end;
letter: begin incr(hn); hc[hn]←cur_chr; end;
spacer: @<Hyphenate the previous word, if any@>;
othercases error('Syntax error!')
endcases;
until false;
9999:end.
@ @<Hyphenate...@>=
begin if hn>4 then
begin @<Find hyphen locations for the word in |hc|@>;
write_ln(tty);
for k←1 to hn do
begin write(tty,chr(hc[k]));
if odd(hyf[k]) then write(tty,'-')
else if hyf[k]>0 then write(tty,'=');
end;
end;
hn←0;
end
@* Index.